library(tidyverse)
library(ggformula)
library(janitor)
library(skimr)
library(broom)
library(readxl)
library(jsonlite)
# Use the black-and-white ggplot2 theme for every plot drawn afterwards.
theme_set(theme_bw())
# Fix the random seed so any stochastic step is reproducible between runs.
set.seed(666)

# gvc_opentargets
Setup environment
Read and prep data
Genes within 1Mb window of GVC loci:
# Read the GVC 1 Mb comparison workbook and tidy it for joining.
gvc <- read_xlsx("GVC_1Mb_comparison_050224.xlsx") |>
  # Standardise the column names (janitor::clean_names).
  clean_names() |>
  # Split gene_id into the identifier and a trailing "version" piece;
  # separate()'s default separator is any run of non-alphanumeric characters.
  # NOTE(review): presumably Ensembl "ID.version" strings -- confirm.
  separate(gene_id, into = c("gene_id", "version")) |>
  # Drop the version and the two annotation columns not used in this analysis.
  select(-version, -agora_nominated_list, -opentarget_info)
gvc

# Gene prioritization scores from Agora
# https://www.synapse.org/Synapse:syn25741025:
# Load the Agora overall-scores JSON as a tibble.
ago <- read_json("syn25741025.overall_scores.json", simplifyVector = TRUE) |>
  as_tibble()
ago

# Number of gvc rows whose gene ID is present in Agora.
sum(gvc$gene_id %in% ago$ensembl_gene_id)
#> [1] 2248

# Same overlap check, matching on gene symbol instead of gene ID.
sum(gvc$gene_symbol %in% ago$hgnc_symbol)
#> [1] 2234
Genes from Open Targets:
# Load the Open Targets associated-targets export (TSV).
ot <- read_tsv(
  "OT-MONDO_0004975-associated-targets-5_4_2024-v24_03.tsv",
  show_col_types = FALSE
)
ot

# Number of Open Targets symbols that are absent from Agora. Written with
# !(x %in% y) so it does not depend on a `%notin%` operator being defined.
sum(!(ot$symbol %in% ago$hgnc_symbol))
#> [1] 293
library(gprofiler2)

# Keep the original Open Targets column names so they can be re-selected
# after the g:Convert result has been joined back on.
otcols <- colnames(ot)

# Convert Open Targets gene symbols to ENSG identifiers with g:Convert, then
# re-attach the Open Targets columns by matching each result row to the
# position of its symbol in the query.
otensg <- gconvert(
  query = ot$symbol,
  organism = "hsapiens",
  target = "ENSG",
  mthreshold = Inf,
  filter_na = TRUE
) |>
  # input_number is compared with character row names below, so cast it.
  mutate(input_number = as.character(input_number)) |>
  left_join(
    ot |> rownames_to_column(var = "input_number"),
    by = "input_number"
  ) |>
  # all_of() makes it explicit that otcols is an external character vector;
  # passing it bare to select() is deprecated by tidyselect.
  select(ensembl_gene_id = target, all_of(otcols))
otensg

# Annotate every gvc row with the Agora scores and the Open Targets columns,
# both matched on the gene ID, and sort by Open Targets globalScore
# (highest first).
d <- gvc |>
  left_join(ago, by = join_by(gene_id == ensembl_gene_id)) |>
  left_join(otensg, by = join_by(gene_id == ensembl_gene_id)) |>
  arrange(desc(globalScore))
d
skim(d)
| Name | d |
| Number of rows | 2473 |
| Number of columns | 61 |
| _______________________ | |
| Column type frequency: | |
| character | 51 |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| grouped_loci_rs_id_gvc | 0 | 1.00 | 6 | 11 | 0 | 146 | 0 |
| grouped_loci_gvc | 0 | 1.00 | 3 | 43 | 0 | 84 | 0 |
| chrom | 0 | 1.00 | 4 | 5 | 0 | 21 | 0 |
| rsid | 0 | 1.00 | 6 | 11 | 0 | 146 | 0 |
| ref | 0 | 1.00 | 1 | 1 | 0 | 4 | 0 |
| alt | 0 | 1.00 | 1 | 5 | 0 | 14 | 0 |
| gene_id | 0 | 1.00 | 15 | 15 | 0 | 1344 | 0 |
| gene_strand | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| gene_symbol | 0 | 1.00 | 3 | 14 | 0 | 1344 | 0 |
| gene_type | 0 | 1.00 | 14 | 14 | 0 | 1 | 0 |
| hgnc_symbol | 221 | 0.91 | 3 | 14 | 0 | 1212 | 0 |
| symbol | 1649 | 0.33 | 3 | 11 | 0 | 404 | 0 |
| otGeneticsPortal | 1649 | 0.33 | 7 | 20 | 0 | 196 | 0 |
| geneBurden | 1649 | 0.33 | 7 | 7 | 0 | 1 | 0 |
| eva | 1649 | 0.33 | 1 | 19 | 0 | 9 | 0 |
| genomicsEngland | 1649 | 0.33 | 7 | 18 | 0 | 4 | 0 |
| gene2Phenotype | 1649 | 0.33 | 7 | 7 | 0 | 1 | 0 |
| uniprotLiterature | 1649 | 0.33 | 7 | 18 | 0 | 3 | 0 |
| uniprotVariants | 1649 | 0.33 | 7 | 18 | 0 | 3 | 0 |
| orphanet | 1649 | 0.33 | 7 | 17 | 0 | 2 | 0 |
| clingen | 1649 | 0.33 | 7 | 7 | 0 | 1 | 0 |
| cancerGeneCensus | 1649 | 0.33 | 7 | 7 | 0 | 1 | 0 |
| intogen | 1649 | 0.33 | 7 | 7 | 0 | 1 | 0 |
| evaSomatic | 1649 | 0.33 | 7 | 7 | 0 | 1 | 0 |
| cancerBiomarkers | 1649 | 0.33 | 7 | 7 | 0 | 1 | 0 |
| chembl | 1649 | 0.33 | 7 | 19 | 0 | 14 | 0 |
| crisprScreen | 1649 | 0.33 | 7 | 19 | 0 | 41 | 0 |
| crispr | 1649 | 0.33 | 7 | 7 | 0 | 1 | 0 |
| slapenrich | 1649 | 0.33 | 7 | 7 | 0 | 1 | 0 |
| progeny | 1649 | 0.33 | 7 | 7 | 0 | 1 | 0 |
| reactome | 1649 | 0.33 | 7 | 17 | 0 | 2 | 0 |
| sysbio | 1649 | 0.33 | 7 | 19 | 0 | 2 | 0 |
| europepmc | 1649 | 0.33 | 7 | 20 | 0 | 43 | 0 |
| expressionAtlas | 1649 | 0.33 | 7 | 20 | 0 | 173 | 0 |
| impc | 1649 | 0.33 | 7 | 19 | 0 | 7 | 0 |
| maxClinicalTrialPhase | 1649 | 0.33 | 1 | 7 | 0 | 4 | 0 |
| isInMembrane | 1649 | 0.33 | 1 | 7 | 0 | 3 | 0 |
| isSecreted | 1649 | 0.33 | 1 | 7 | 0 | 3 | 0 |
| hasLigand | 1649 | 0.33 | 1 | 7 | 0 | 3 | 0 |
| hasSmallMoleculeBinder | 1649 | 0.33 | 1 | 7 | 0 | 3 | 0 |
| hasPocket | 1649 | 0.33 | 1 | 7 | 0 | 3 | 0 |
| mouseOrthologMaxIdentityPercentage | 1649 | 0.33 | 1 | 20 | 0 | 272 | 0 |
| hasHighQualityChemicalProbes | 1649 | 0.33 | 1 | 7 | 0 | 3 | 0 |
| geneticConstraint | 1649 | 0.33 | 7 | 22 | 0 | 403 | 0 |
| mouseKoScore | 1649 | 0.33 | 1 | 21 | 0 | 230 | 0 |
| geneEssentiality | 1649 | 0.33 | 1 | 7 | 0 | 3 | 0 |
| hasSafetyEvent | 1649 | 0.33 | 2 | 7 | 0 | 2 | 0 |
| isCancerDriverGene | 1649 | 0.33 | 2 | 7 | 0 | 2 | 0 |
| paralogMaxIdentityPercentage | 1649 | 0.33 | 1 | 21 | 0 | 73 | 0 |
| tissueSpecificity | 1649 | 0.33 | 1 | 7 | 0 | 5 | 0 |
| tissueDistribution | 1649 | 0.33 | 1 | 7 | 0 | 5 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| chrom_start | 0 | 1.00 | 62436245.64 | 48374178.54 | 413333.00 | 31121778.00 | 51224705.00 | 94979791.00 | 233173930.00 | ▇▆▂▂▁ |
| chrom_end | 0 | 1.00 | 62436246.64 | 48374178.54 | 413334.00 | 31121779.00 | 51224706.00 | 94979792.00 | 233173931.00 | ▇▆▂▂▁ |
| distance | 0 | 1.00 | 12719.87 | 278764.16 | -499286.00 | -218912.00 | 14939.00 | 241039.00 | 499987.00 | ▆▇▇▇▇ |
| absolute_distance | 0 | 1.00 | 238261.90 | 145188.23 | 114.00 | 115708.00 | 229173.00 | 361466.00 | 499987.00 | ▇▇▇▇▆ |
| gene_start | 0 | 1.00 | 62406168.65 | 48373400.00 | 87249.00 | 31131432.00 | 50825288.00 | 95244912.00 | 233671897.00 | ▇▆▂▂▁ |
| gene_end | 0 | 1.00 | 62440568.15 | 48376158.70 | 97094.00 | 31135727.00 | 50837213.00 | 95269201.00 | 233773300.00 | ▇▆▂▂▁ |
| target_risk_score | 221 | 0.91 | 2.46 | 0.99 | 0.57 | 1.57 | 2.44 | 3.34 | 4.71 | ▆▇▇▇▂ |
| genetics_score | 221 | 0.91 | 1.66 | 0.46 | 0.57 | 1.28 | 1.64 | 2.00 | 2.90 | ▂▇▇▆▁ |
| multi_omics_score | 465 | 0.81 | 0.90 | 0.76 | 0.00 | 0.00 | 1.01 | 1.54 | 2.00 | ▇▂▂▅▅ |
| globalScore | 1649 | 0.33 | 0.13 | 0.16 | 0.00 | 0.01 | 0.06 | 0.23 | 0.84 | ▇▂▁▁▁ |
# Unique Open Targets symbols, ordered from highest to lowest globalScore.
query <- unique(arrange(ot, desc(globalScore))$symbol)

# Run the g:Profiler enrichment on that ordered list: FDR correction at a
# 0.005 threshold, significant terms only, electronic (IEA) annotations
# excluded, annotated genes as the statistical domain scope.
gostres <- gost(
  query             = query,
  organism          = "hsapiens",
  ordered_query     = TRUE,
  exclude_iea       = TRUE,
  domain_scope      = "annotated",
  correction_method = "fdr",
  user_threshold    = 0.005,
  significant       = TRUE
)
# Table of enrichment results returned by gost().
gostres$result

# Interactive plot of the enrichment results.
gostplot(gostres, capped = FALSE, interactive = TRUE)

# Spearman correlation between the Open Targets globalScore and the Agora
# target_risk_score, computed on the rows with the 100 highest globalScore
# values (slice_max keeps ties by default) that have both scores.
otensg |>
  left_join(ago, by = "ensembl_gene_id") |>
  slice_max(globalScore, n = 100) |>
  select(
    ensembl_gene_id, symbol, globalScore,
    target_risk_score, genetics_score, multi_omics_score
  ) |>
  drop_na(globalScore, target_risk_score) |>
  # tidy() turns the htest object into a one-row data frame, which unnest()
  # then spreads into ordinary columns.
  summarize(
    cor = tidy(cor.test(globalScore, target_risk_score, method = "spearman"))
  ) |>
  unnest(cor)